## Toy example with conspiracy data

## IGNORE THIS -- it's code for me
## This is the code to downsample to just the data for the toy example
#load("C:/Users/Richard Nielsen/Dropbox/conspiracy_files/ConspDataNew.RData")
#ls()
#tmpdat1 <- texts_dt[!is.na(texts_dt$conspiracy_training),]
#tmpdat2 <- texts_dt[which(is.na(texts_dt$conspiracy_training))[1:1000],]
#texts_dt <- rbind(tmpdat1, tmpdat2)
#dim(texts_dt)
#
#ls()
#rm("metadata","my_terms","sum1_dt","sum2_dt","tmpdat1","tmpdat2")
#gc()
#source("C:/Users/Richard Nielsen/Desktop/Professional Stuff/mit/grad advisement/Michael Freedman/text analysis toy example/light10_24feb2014.R")
#save.image("C:\\Users\\Richard Nielsen\\Desktop\\Professional Stuff\\mit\\grad advisement\\Michael Freedman\\text analysis toy example\\conspiracy toy example workspace_2apr201r.RData")

## START HERE
## NOTE: this wipes every object in the global environment before loading,
## so run the script in a fresh session or expect existing objects to be lost.
rm(list = ls())

## Location of the saved toy-example workspace. The default is the original
## author's machine-specific path; set the CONSPIRACY_TOY_WORKSPACE
## environment variable to point at your own copy of the .RData file.
workspace_path <- Sys.getenv(
  "CONSPIRACY_TOY_WORKSPACE",
  unset = "C:\\Users\\Richard Nielsen\\Desktop\\Professional Stuff\\mit\\grad advisement\\Michael Freedman\\text analysis toy example\\conspiracy toy example workspace_2apr201r.RData"
)
## Fail early with a readable message instead of load()'s connection error.
if (!file.exists(workspace_path)) {
  stop(
    "Workspace file not found: ", workspace_path,
    "\nSet the CONSPIRACY_TOY_WORKSPACE environment variable to its location.",
    call. = FALSE
  )
}
load(workspace_path)

# texts_dt: the main dataset, with the variables I discussed in my previous email
#  $ path               : chr  "ahram/extractedfeb2015/2009-12-27/subindex10articleEconomy_News_1186.txt" "ahram/extractedfeb2015/2009-12-27/subindex10articleEconomy_News_1186.txt" "ahram/extractedfeb2015/2009-12-27/subindex10articleEconomy_News_1186.txt" "ahram/extractedfeb2015/2009-12-27/subindex10articleEconomy_News_1186.txt" ...
#  $ newspaper          : chr  "ahram" "ahram" "ahram" "ahram" ...
#  $ date               : IDate, format: "2009-12-27" "2009-12-27" ...
#  $ year               : int  2009 2009 2009 2009 2009 2009 2009 2009 2009 2009 ...
#  $ month              : chr  "December" "December" "December" "December" ...
#  $ day                : chr  "Sunday" "Sunday" "Sunday" "Sunday" ...
#  $ paragraph_num      : int  1 2 3 4 5 1 2 3 4 5 ...
#  $ paragraph_full     : chr  "Original Arabic here"| __truncated__ ...
#  $ paragraph_clean    : chr  "Cleaned Arabic here"| __truncated__ ...
#  $ num_words          : num  7 4 38 244 99 7 4 26 11 9 ...
#  $ conspiracy_naive   : int  0 0 0 0 0 0 0 0 0 0 ...
#  $ conspiracy_training: int  NA NA NA NA NA NA NA NA NA NA ...
# 
# where "conspiracy_naive" is a dichotomous variable indicating whether the paragraph contains a conspiracy keyword (1) or not (0), and where "conspiracy_training" is a dichotomous variable indicating whether the paragraph is in our training set of conspiracy paragraphs (1), in our training set of non-conspiracy paragraphs (0), or neither (NA).
# 
# In the "clean" paragraphs, I have removed Arabic presentation non-characters and RTL markers, all non-Arabic text, abjad numerals, common diacritics, punctuation, and any extra spaces. But I have not run your stemmer on it yet.

## Quick look at the loaded data: its size and the counts of each label value.
dim(texts_dt)
table(texts_dt[["conspiracy_naive"]])
table(texts_dt[["conspiracy_training"]])

## The first ten cleaned paragraphs.
texts_dt[["paragraph_clean"]][1:10]


## Fits a random forest model to the document-term matrix built below.
## This creates a model object that can be saved and used for predictions.

#use caret library
library(caret)
library(tm)

## tdat is all of texts_dt -- no rows are dropped here. Rows with and without
## a training label are separated later, once the Class column exists.
tdat <- texts_dt

dim(tdat)

## Build a tm corpus with one document per cleaned paragraph (timed).
system.time({
  mycorpus <- VCorpus(VectorSource(tdat[["paragraph_clean"]]))
})

## Build the document-term matrix (timed).
## All of tm's text normalisation is switched off because paragraph_clean is
## already cleaned upstream. The minimum term length of 3 characters is given
## through `wordLengths`; the old `minWordLength` option is not recognised by
## current tm and was silently ignored.
system.time(
  dtm <- DocumentTermMatrix(
    mycorpus,
    control = list(
      tolower = FALSE,
      stemming = FALSE,
      stopwords = FALSE,
      wordLengths = c(3, Inf),
      removeNumbers = FALSE,
      removePunctuation = FALSE
    )
  )
)

dim(dtm)
## Step 1: drop the rare terms (sparsity cutoff 0.97).
## Very few terms survive this cutoff -- it is only used here so that the
## example code runs faster.
dtm2 <- removeSparseTerms(dtm, sparse = 0.97)
dim(dtm2)
## Step 2: find the very common terms (sparsity cutoff 0.95) ...
dtmCommon <- removeSparseTerms(dtm, sparse = 0.95)
dim(dtmCommon)
colnames(dtmCommon)
## ... and take them out of the step-1 result.
dtm2 <- dtm2[, setdiff(colnames(dtm2), colnames(dtmCommon))]
dim(dtm2)
## Step 3: restrict the full matrix to the fairly common but not too common
## terms that are left.
keepterms <- unique(colnames(dtm2))
dtm3 <- dtm[, keepterms]
dtm <- dtm3
dim(dtm)  ## We're using very few predictors here!!!!!!
## Free the intermediate matrices.
rm(dtm3, dtm2, dtmCommon)
gc()

## Convert the sparse document-term matrix into a dense numeric matrix.
## as.matrix() + storage.mode() keeps the result a matrix whatever its
## dimensions; the previous apply(..., as.numeric) collapsed a one-row input
## to a bare vector and relied on partial argument matching (MAR).
dtm <- as.matrix(dtm)
storage.mode(dtm) <- "double"
## Use the row names of tdat so predictions can be matched back to the data.
rownames(dtm) <- rownames(tdat)
head(dtm)
## Transliterate column names (transliterate() comes from the stemmer code,
## which is not defined in this script).
colnames(dtm) <- transliterate(colnames(dtm))
## Are there duplicates now (induced by transliteration)?
table(duplicated(colnames(dtm)))
## Duplicate column names would make the model formula ambiguous, so flag
## them instead of relying on a manual look at the table above.
if (anyDuplicated(colnames(dtm)) > 0) {
  warning(
    "Transliteration produced duplicated column names in dtm.",
    call. = FALSE
  )
}

trainingDatComp <- as.data.frame(dtm)

## Add the outcome column: conspiracy_training 1 -> "conspiracy", 0 -> "not",
## anything else (including NA) -> NA.
trainingDatComp$Class <- as.factor(
  c("not", "conspiracy")[match(tdat$conspiracy_training, c(0, 1))]
)

head(trainingDatComp)
trainingDatComp$Class
## Rows without a label become the uncoded set to predict for; the labelled
## rows stay in trainingDatComp.
testDatComp <- trainingDatComp[is.na(trainingDatComp$Class), ]
trainingDatComp <- trainingDatComp[!is.na(trainingDatComp$Class), ]
dim(testDatComp)
dim(trainingDatComp)

## Confusion matrix helper to check models as they're tuned.
##
## x: a fitted model that predict() can be called on.
## y: complete data -- the predictor columns followed by the outcome in the
##    LAST column.
## Returns the result of confusionMatrix() for the model's predictions on y
## against the observed outcome.
confMatPred <- function(x, y) {
  outcome_idx <- ncol(y)
  # drop = FALSE keeps a data frame even when there is a single predictor.
  predictors <- y[, -outcome_idx, drop = FALSE]
  observed <- as.factor(y[, outcome_idx])
  # confusionMatrix() needs two factors with the same levels, so align the
  # predictions with the observed levels rather than converting to numeric.
  predicted <- factor(
    predict(x, newdata = predictors),
    levels = levels(observed)
  )
  confusionMatrix(data = predicted, reference = observed)
}

##########CURRENT BEST: random forest with tuning grid:

set.seed(123)

## Candidate mtry values: up to 10 values between 10 and 200, clamped to the
## number of available predictors (every column of trainingDatComp except
## Class). randomForest resets any mtry above the predictor count, so without
## the clamp a small term matrix yields several identical fits plus warnings.
## With 200 or more predictors the grid is the same as before.
n_predictors <- ncol(trainingDatComp) - 1L
mtryVals <- unique(floor(seq(
  max(1, min(10, n_predictors)),
  max(1, min(200, n_predictors)),
  length.out = 10
)))
mtryGrid <- data.frame(.mtry = mtryVals)


set.seed(12355)
## Split the coded rows: 80 percent to fit the model, 20 percent held back.
inTrain <- createDataPartition(
  y = trainingDatComp$Class,  # the outcome data are needed
  p = 0.80,                   # share of the rows that goes to the training set
  list = FALSE
)

training <- trainingDatComp[inTrain, ]
testing <- trainingDatComp[-inTrain, ]
dim(training)
dim(testing)


set.seed(021)
## Fit the random forest over the mtry grid (timed).
## * importance = TRUE is forwarded to the random forest fit; it was
##   previously misspelled "imortance", so it never took effect.
## * Resampling uses the forest's out-of-bag error (method = "oob"). The
##   number/repeats settings only apply to cross-validation style resampling,
##   so they are not set here.
## * selectionFunction = "tolerance" is kept as the rule for choosing the
##   final mtry.
system.time(
  rfFitCompBack <- train(
    Class ~ .,
    data = training,
    method = "rf",
    tuneGrid = mtryGrid,
    ntree = 1000,
    importance = TRUE,
    trControl = trainControl(
      method = "oob",
      classProbs = TRUE,
      selectionFunction = "tolerance"
    )
  )
)  ## End system.time

## Timing recorded by the original author (before importance was enabled):
#   user  system elapsed 
# 155.59    0.40  156.13 
155/60  ## 2.6 minutes to run on my machine


## Prediction within the 80 percent of the training set that we used
## to train the classifier
rfFitCompBack
# accuracy?
# Use `training` (the 80 percent split), not trainingDatComp: the latter also
# contains the held-back 20 percent and would mix the two evaluations.
confMatPred(rfFitCompBack, training)
# num false positives/negatives?

## prediction in the 20 percent of the training set that we held back
rfClasses <- predict(rfFitCompBack, newdata = testing)
confusionMatrix(data = rfClasses, reference = testing$Class)
## That's pretty bad!  It's because we limited the words so dramatically so none of the predictors
## are actually related to predicting conspiracy or not.

## Classify the uncoded rows with the fitted model and count the predictions
## per class.
rfPrediction <- predict(object = rfFitCompBack, newdata = testDatComp)
table(rfPrediction)
## Name each prediction after its row in testDatComp. Those row names stay the
## same throughout this script, so they can be used to match rfPrediction back
## up with the data.
names(rfPrediction) <- rownames(testDatComp)




